https://blog.finxter.com/5-best-ways-to-construct-and-manage-a-tree-in-python/ https://builtin.com/articles/tree-python #This one is more complex https://bigtree.readthedocs.io/en/0.14.8/ #There is this package to create trees, but maybe it is too complex for us Pouly, Marc. “Estimating Text Similarity based on Semantic Concept Embeddings.” arXiv preprint arXiv:2401.04422 (2024).
import torch
import einops
import math
from transformers import AutoModel
# Load the Jina AI embeddings model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)
taxonomy_tree = {
'1': {
'2': {
'A': 'Lake',
'B': 'River'
},
'C': 'House',
'3': {
'4': {
'D': 'Mountain',
'E': 'Everest',
'F': 'Volcano'
}
}
}
}
# Function to extract leaf nodes
def get_leaf_nodes(taxonomy):
leaves = {}
def traverse(node, path):
if isinstance(node, dict):
for k, v in node.items():
traverse(v, path + [k])
else:
leaves[path[-1]] = node # Leaf node with its path
traverse(taxonomy, [])
return leaves
# Function to calculate similarity using the Jina AI embeddings model
def calculate_similarity(text1, text2):
# Encode texts to get embeddings
embeddings = model.encode([text1, text2])
# Calculate cosine similarity
sim = torch.nn.functional.cosine_similarity(torch.tensor(embeddings[0]), torch.tensor(embeddings[1]), dim=0)
return sim.item()
# Function to calculate R(T)
def calculate_r_t(taxonomy):
leaves = get_leaf_nodes(taxonomy)
leaf_names = list(leaves.values())
groups = [leaf_names[i:i + 2] for i in range(0, len(leaf_names), 2)] # Grouping pairs
total_groups = len(groups)
r_t_values = []
for group in groups:
# Calculate pairwise similarities within the group
similarities = []
for i in range(len(group)):
for j in range(i + 1, len(group)):
sim = calculate_similarity(group[i], group[j])
similarities.append(sim)
if similarities:
min_similarity = min(similarities)
else:
min_similarity = 0 # No pairs means no intruders possible
# Count intruders
intruder_count = 0
for leaf in leaf_names:
if leaf not in group:
sim_with_group = calculate_similarity(leaf, group[0])
if sim_with_group > min_similarity:
intruder_count += 1
# Calculate R(T) for this group
n_ic = intruder_count
n_gc = len(group)
n_ac = len(leaf_names)
r_t = (1 - (n_ic / (n_gc * (n_ac - n_gc)))) if n_gc * (n_ac - n_gc) > 0 else 0
r_t_values.append(r_t)
return sum(r_t_values) / total_groups if total_groups > 0 else 0
def extract_ncat(taxonomy):
ncat = 0
first_category_found = False # Flag to track if the first category has been encountered
def count_categories(node, is_root=True):
nonlocal ncat, first_category_found
if isinstance(node, dict):
# Only count nodes that are not the root and not leaves
if not is_root:
if not first_category_found:
first_category_found = True # Set the flag after the first category is found
else:
ncat += 1 # Count the intermediate category
print(f"Found category: {list(node.keys())}") # Print the keys of the current category
# Recursively process children, marking them as non-root
for child in node.values():
count_categories(child, is_root=False)
count_categories(taxonomy)
return ncat
def extract_nchar(taxonomy):
nchar = 0
def count_characteristics(node):
nonlocal nchar
if isinstance(node, dict):
for child in node.values():
count_characteristics(child)
else:
nchar += 1 # Count the current characteristic
count_characteristics(taxonomy)
return nchar
def extract_depths_cat(taxonomy):
depths_cat = []
def find_depths(node, depth):
if isinstance(node, dict):
depths_cat.append(depth) # Record the depth of this category
for child in node.values():
find_depths(child, depth + 1)
find_depths(taxonomy, 0) # Start from depth 0
return depths_cat
def extract_depths_char(taxonomy):
depths_char = []
def find_characteristic_depths(node, depth):
if isinstance(node, dict):
for child in node.values():
find_characteristic_depths(child, depth + 1)
else:
depths_char.append(depth) # Record the depth of this characteristic
find_characteristic_depths(taxonomy, 0) # Start from depth 0
return depths_char
import math
def calculate_conciseness(ncat, nchar, depths_cat, depths_char):
"""
Calculate the conciseness of the taxonomy using the proposed formula.
Parameters:
ncat (int): The number of categories.
nchar (int): The number of characteristics.
depths_cat (list): A list of depths for categories.
depths_char (list): A list of depths for characteristics.
Returns:
float: The conciseness value of the taxonomy.
"""
# Calculate the sum of the inverses of the depths for categories and characteristics
# Only include depths greater than 0 to avoid division by zero
sum_cat = sum(1 / d for d in depths_cat if d > 0) if ncat > 0 else 0 # Sum for categories
sum_char = sum(1 / d for d in depths_char if d > 0) if nchar > 0 else 0 # Sum for characteristics
# Calculate the total sum of inverses of depths
total_sum = sum_cat + sum_char
# Calculate conciseness using the provided formula
if total_sum > 0:
C_T = 1 / (1 + math.log(total_sum - 1))
else:
C_T = 0 # Return 0 if total_sum is not positive
return C_T
ncat = extract_ncat(taxonomy_tree)
## Found category: ['A', 'B']
## Found category: ['4']
## Found category: ['D', 'E', 'F']
nchar = extract_nchar(taxonomy_tree)
depths_cat = extract_depths_cat(taxonomy_tree)
depths_char = extract_depths_char(taxonomy_tree)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 3
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 6
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 2, 3]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [3, 3, 2, 4, 4, 4]
# Calculate R(T) for the given taxonomy
leaves=get_leaf_nodes(taxonomy_tree)
print(leaves)
## {'A': 'Lake', 'B': 'River', 'C': 'House', 'D': 'Mountain', 'E': 'Everest', 'F': 'Volcano'}
robustness_value = calculate_r_t(taxonomy_tree)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.9583
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.45899878671895267
new_taxonomy = {
'Cost estimation for GSD': {
'Cost estimation context': {
'Planning': {
"Conceptualization": "Conceptualization",
"Feasibility study": "Feasibility study",
"Preliminary planning": "Preliminary planning",
"Detail Planning": "Detail planning",
"Execution": "Execution",
"Commissioning": "Commissioning"
},
'Project activities': {
"System investigation": "System investigation",
"Analysis": "Analysis",
"Design": "Design",
"Implementation": "Implementation",
"Testing": "Testing",
"Maintenance": "Maintenance",
"Other": "Other"
},
'Project domain': {
"SE": "Systems Engineering",
"Research & Dev": {
"Telecommunication": "Telecommunication"
},
"Finance": "Finance",
"Healthcare": "Healthcare",
"Other": "Other"
},
'Project setting': {
"Close onshore": "Close onshore",
"Distant onshore": "Distant onshore",
"Near offshore": "Near offshore",
"Far offshore": "Far offshore"
},
'Planning approaches': {
"Constructive Cost Model": "Constructive Cost Model",
"Capability Maturity Model Integration": "Capability Maturity Model Integration",
"Agile": "Agile",
"Delphi": "Delphi",
"GA": "Genetic Algorithms",
"CBR": "Case-Based Reasoning",
"Fuzzy similar": "Fuzzy similar",
"Other": "Other"
},
'Number of sites': {
"Value": "Value"
},
'Team size': {
"No of team members": "Number of team members"
}
},
'Estimation technique': {
'Estimation technique': {
"Expert judgment": "Expert judgment",
"Machine learning": "Machine learning",
"Non-machine learning": "Non-machine learning"
},
'Use technique': {
"Individual": "Individual",
"Group-based estimation": "Group-based estimation"
}
},
'Cost estimate': {
'Estimated cost': {
"Estimate value": "Estimated cost value"
},
'Actual cost': {
"Value": "Actual cost value"
},
'Estimation dimension': {
"Effort hours": "Effort hours",
"Staff/cost": "Staff/cost",
"Hardware": "Hardware",
"Risk": "Risk",
"Portfolio": "Portfolio"
},
'Accuracy measure': {
"Baseline comparison": "Baseline comparison",
"Variation reduction": "Variation reduction",
"Sensitivity analysis": "Sensitivity analysis"
}
},
'Cost estimators': {
'Product size': {
"Size report": "Size report",
"Statistics analysis": "Statistics analysis"
},
'Team experience': {
"Considered": "Considered experience",
"Not considered": "Not considered experience"
},
'Team structure': {
"Considered": "Considered structure",
"Not Considered": "Not considered structure"
},
'Product requirement': {
"Performance": "Performance",
"Security": "Security",
"Availability": "Availability",
"Reliability": "Reliability",
"Maintainability": "Maintainability",
"Other": "Other requirement"
},
'Distributed teams distances': {
"Geographical distance": "Geographical distance",
"Temporal distance": "Temporal distance",
"Socio-cultural distance": "Socio-cultural distance"
}
}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Conceptualization': 'Conceptualization', 'Feasibility study': 'Feasibility study', 'Preliminary planning': 'Preliminary planning', 'Detail Planning': 'Detail planning', 'Execution': 'Execution', 'Commissioning': 'Commissioning', 'System investigation': 'System investigation', 'Analysis': 'Analysis', 'Design': 'Design', 'Implementation': 'Implementation', 'Testing': 'Testing', 'Maintenance': 'Maintenance', 'Other': 'Other requirement', 'SE': 'Systems Engineering', 'Telecommunication': 'Telecommunication', 'Finance': 'Finance', 'Healthcare': 'Healthcare', 'Close onshore': 'Close onshore', 'Distant onshore': 'Distant onshore', 'Near offshore': 'Near offshore', 'Far offshore': 'Far offshore', 'Constructive Cost Model': 'Constructive Cost Model', 'Capability Maturity Model Integration': 'Capability Maturity Model Integration', 'Agile': 'Agile', 'Delphi': 'Delphi', 'GA': 'Genetic Algorithms', 'CBR': 'Case-Based Reasoning', 'Fuzzy similar': 'Fuzzy similar', 'Value': 'Actual cost value', 'No of team members': 'Number of team members', 'Expert judgment': 'Expert judgment', 'Machine learning': 'Machine learning', 'Non-machine learning': 'Non-machine learning', 'Individual': 'Individual', 'Group-based estimation': 'Group-based estimation', 'Estimate value': 'Estimated cost value', 'Effort hours': 'Effort hours', 'Staff/cost': 'Staff/cost', 'Hardware': 'Hardware', 'Risk': 'Risk', 'Portfolio': 'Portfolio', 'Baseline comparison': 'Baseline comparison', 'Variation reduction': 'Variation reduction', 'Sensitivity analysis': 'Sensitivity analysis', 'Size report': 'Size report', 'Statistics analysis': 'Statistics analysis', 'Considered': 'Considered structure', 'Not considered': 'Not considered experience', 'Not Considered': 'Not considered structure', 'Performance': 'Performance', 'Security': 'Security', 'Availability': 'Availability', 'Reliability': 'Reliability', 'Maintainability': 'Maintainability', 'Geographical distance': 'Geographical distance', 'Temporal distance': 'Temporal distance', 'Socio-cultural distance': 'Socio-cultural distance'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Planning', 'Project activities', 'Project domain', 'Project setting', 'Planning approaches', 'Number of sites', 'Team size']
## Found category: ['Conceptualization', 'Feasibility study', 'Preliminary planning', 'Detail Planning', 'Execution', 'Commissioning']
## Found category: ['System investigation', 'Analysis', 'Design', 'Implementation', 'Testing', 'Maintenance', 'Other']
## Found category: ['SE', 'Research & Dev', 'Finance', 'Healthcare', 'Other']
## Found category: ['Telecommunication']
## Found category: ['Close onshore', 'Distant onshore', 'Near offshore', 'Far offshore']
## Found category: ['Constructive Cost Model', 'Capability Maturity Model Integration', 'Agile', 'Delphi', 'GA', 'CBR', 'Fuzzy similar', 'Other']
## Found category: ['Value']
## Found category: ['No of team members']
## Found category: ['Estimation technique', 'Use technique']
## Found category: ['Expert judgment', 'Machine learning', 'Non-machine learning']
## Found category: ['Individual', 'Group-based estimation']
## Found category: ['Estimated cost', 'Actual cost', 'Estimation dimension', 'Accuracy measure']
## Found category: ['Estimate value']
## Found category: ['Value']
## Found category: ['Effort hours', 'Staff/cost', 'Hardware', 'Risk', 'Portfolio']
## Found category: ['Baseline comparison', 'Variation reduction', 'Sensitivity analysis']
## Found category: ['Product size', 'Team experience', 'Team structure', 'Product requirement', 'Distributed teams distances']
## Found category: ['Size report', 'Statistics analysis']
## Found category: ['Considered', 'Not considered']
## Found category: ['Considered', 'Not Considered']
## Found category: ['Performance', 'Security', 'Availability', 'Reliability', 'Maintainability', 'Other']
## Found category: ['Geographical distance', 'Temporal distance', 'Socio-cultural distance']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 23
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 62
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.8361
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.2400686568621428
new_taxonomy = {
'Web Predictor': {
'Size Metric': {
'Length': {
'Web page count': 'Web page count',
'Media count': 'Media count',
'New media count': 'New media count',
'New Web page count': 'New Web page count',
'Link count': 'Link count',
'Program count': 'Program count',
'Reused component count': 'Reused component count',
'Lines of code': 'Lines of code',
'Reused program count': 'Reused program count',
'Reused media count': 'Reused media count',
'Web page allocation': 'Web page allocation',
'Reused lines of code': 'Reused lines of code',
'Media allocation': 'Media allocation',
'Reused media allocation': 'Reused media allocation',
'Entity count': 'Entity count',
'Attribute count': 'Attribute count',
'Component count': 'Component count',
'Statement count': 'Statement count',
'Node count': 'Node count',
'Collection slot size': 'Collection slot size',
'Component granularity level': 'Component granularity level',
'Slot granularity level': 'Slot granularity level',
'Model node size': 'Model node size',
'Cluster node size': 'Cluster node size',
'Node slot size': 'Node slot size',
'Publishing model unit count': 'Publishing model unit count',
'Model slot size': 'Model slot size',
'Association slot size': 'Association slot size',
'Client script count': 'Client script count',
'Server script count': 'Server script count',
'Information slot count': 'Information slot count',
'Association center slot count': 'Association center slot count',
'Collection center slot count': 'Collection center slot count',
'Component slot count': 'Component slot count',
'Semantic association count': 'Semantic association count',
'Segment count': 'Segment count',
'Slot count': 'Slot count',
'Cluster slot count': 'Cluster slot count',
'Cluster count': 'Cluster count',
'Publishing unit count': 'Publishing unit count',
'Section count': 'Section count',
'Inner/sub concern count': 'Inner/sub concern count',
'Indifferent concern count': 'Indifferent concern count',
'Module point cut count': 'Module point cut count',
'Module count': 'Module count',
'Module attribute count': 'Module attribute count',
'Operation count': 'Operation count',
'Comment count': 'Comment count',
'Reused comment count': 'Reused comment count',
'Media duration': 'Media duration',
'Diffusion cut count': 'Diffusion cut count',
'Concern module count': 'Concern module count',
'Concern operation count': 'Concern operation count',
'Anchor count': 'Anchor count'},
'Functionality': {
'High feature count': 'High feature count',
'Low feature count': 'Low feature count',
'Reused high feature count': 'Reused high feature count',
'Reused low feature count': 'Reused low feature count',
'Web objects': 'Web objects',
'Common Software Measurement International Consortium': 'Common Software Measurement International Consortium',
'International Function Point Users Group': 'International Function Point Users Group',
'Object-Oriented Heuristic Function Points': 'Object-Oriented Heuristic Function Points',
'Object-Oriented Function Points': 'Object-Oriented Function Points',
'Use case count': 'Use case count',
'Feature count': 'Feature count',
'Data Web points': 'Data Web points'},
'Object-oriented': {
'Cohesion': 'Cohesion',
'Class coupling': 'Class coupling',
'Concern coupling': 'Concern coupling'},
'Complexity': {
'Connectivity density': 'Connectivity density',
'Cyclomatic complexity': 'Cyclomatic complexity',
'Model collection complexity': 'Model collection complexity',
'Model association complexity': 'Model association complexity',
'Model link complexity': 'Model link complexity',
'Page complexity': 'Page complexity',
'Component complexity': 'Component complexity',
'Total complexity': 'Total complexity',
'Adaptation complexity': 'Adaptation complexity',
'New complexity': 'New complexity',
'Data usage complexity': 'Data usage complexity',
'Data flow complexity': 'Data flow complexity',
'Cohesion complexity': 'Cohesion complexity',
'Interface complexity': 'Interface complexity',
'Control flow complexity': 'Control flow complexity',
'Class complexity': 'Class complexity',
'Layout complexity': 'Layout complexity',
'Input complexity': 'Input complexity',
'Output complexity': 'Output complexity'}
},
'Cost Driver': {
'Product':{
'Type': 'Type',
'Stratum': 'Stratum',
'Compactness': 'Compactness',
'Structure': 'Structure',
'Architecture': 'Architecture',
'Integration with legacy systems': 'Integration with legacy systems',
'Concurrency level': 'Concurrency level',
'Processing requirements': 'Processing requirements',
'Database size': 'Database size',
'Requirements volatility level': 'Requirements volatility level',
'Requirements novelty level': 'Requirements novelty level',
'Reliability level': 'Reliability level',
'Maintainability level': 'Maintainability level',
'Time efficiency level': 'Time efficiency level',
'Memory efficiency level': 'Memory efficiency level',
'Portability level': 'Portability level',
'Scalability level': 'Scalability level',
'Quality level': 'Quality level',
'Usability level': 'Usability level',
'Readability level': 'Readability level',
'Security level': 'Security level',
'Installability level': 'Installability level',
'Modularity level': 'Modularity level',
'Flexibility level': 'Flexibility level',
'Testability level': 'Testability level',
'Accessibility level': 'Accessibility level',
'Trainability level': 'Trainability level',
'Innovation level': 'Innovation level',
'Technical factors': 'Technical factors',
'Storage constraint': 'Storage constraint',
'Reusability level': 'Reusability level',
'Robustness level': 'Robustness level',
'Design volatility': 'Design volatility',
'Experience level': 'Experience level',
'Requirements clarity level': 'Requirements clarity level'},
'Client': {
'Availability level': 'Availability level',
'IT literacy': 'IT literacy',
'Mapped workflows': 'Mapped workflows',
'Personality': 'Personality'},
'Development Company': {
'SPI program': 'SPI program',
'Metrics’ program': 'Metrics’ program',
'Number of projects in parallel': 'Number of projects in parallel',
'Software reuse': 'Software reuse'},
'Project': {
'Documentation level': 'Documentation level',
'Number of programming languages': 'Number of programming languages',
'Type': 'Type',
'Process efficiency level': 'Process efficiency level',
'Project management level': 'Project management level',
'Infrastructure': 'Infrastructure',
'Development restriction': 'Development restriction',
'Time restriction': 'Time restriction',
'Risk level': 'Risk level',
'Rapid app development': 'Rapid app development',
'Operational mode': 'Operational mode',
'Resource level': 'Resource level',
'Lessons learned repository': 'Lessons learned repository'},
'Team': {
'Domain experience level': 'Domain experience level',
'Team size': 'Team size',
'Deployment platform experience level': 'Deployment platform experience level',
'Team capability': 'Team capability',
'Programming language experience level': 'Programming language experience level',
'Tool experience level': 'Tool experience level',
'Communication level': 'Communication level',
'Software development experience': 'Software development experience',
'Work Team level': 'Work Team level',
'Stability level': 'Stability level',
'Motivation level': 'Motivation level',
'Focus factor': 'Focus factor',
'Tool experience level': 'Tool experience level',
'OO experience level': 'OO experience level',
'In-house experience': 'In-house experience'},
'Technology': {
'Authoring tool type': 'Authoring tool type',
'Productivity level': 'Productivity level',
'Novelty level': 'Novelty level',
'Platform volatility level': 'Platform volatility level',
'Difficulty level': 'Difficulty level',
'Platform support level': 'Platform support level'}}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Web page count': 'Web page count', 'Media count': 'Media count', 'New media count': 'New media count', 'New Web page count': 'New Web page count', 'Link count': 'Link count', 'Program count': 'Program count', 'Reused component count': 'Reused component count', 'Lines of code': 'Lines of code', 'Reused program count': 'Reused program count', 'Reused media count': 'Reused media count', 'Web page allocation': 'Web page allocation', 'Reused lines of code': 'Reused lines of code', 'Media allocation': 'Media allocation', 'Reused media allocation': 'Reused media allocation', 'Entity count': 'Entity count', 'Attribute count': 'Attribute count', 'Component count': 'Component count', 'Statement count': 'Statement count', 'Node count': 'Node count', 'Collection slot size': 'Collection slot size', 'Component granularity level': 'Component granularity level', 'Slot granularity level': 'Slot granularity level', 'Model node size': 'Model node size', 'Cluster node size': 'Cluster node size', 'Node slot size': 'Node slot size', 'Publishing model unit count': 'Publishing model unit count', 'Model slot size': 'Model slot size', 'Association slot size': 'Association slot size', 'Client script count': 'Client script count', 'Server script count': 'Server script count', 'Information slot count': 'Information slot count', 'Association center slot count': 'Association center slot count', 'Collection center slot count': 'Collection center slot count', 'Component slot count': 'Component slot count', 'Semantic association count': 'Semantic association count', 'Segment count': 'Segment count', 'Slot count': 'Slot count', 'Cluster slot count': 'Cluster slot count', 'Cluster count': 'Cluster count', 'Publishing unit count': 'Publishing unit count', 'Section count': 'Section count', 'Inner/sub concern count': 'Inner/sub concern count', 'Indifferent concern count': 'Indifferent concern count', 'Module point cut count': 'Module point cut count', 'Module count': 'Module count', 'Module attribute count': 'Module attribute count', 'Operation count': 'Operation count', 'Comment count': 'Comment count', 'Reused comment count': 'Reused comment count', 'Media duration': 'Media duration', 'Diffusion cut count': 'Diffusion cut count', 'Concern module count': 'Concern module count', 'Concern operation count': 'Concern operation count', 'Anchor count': 'Anchor count', 'High feature count': 'High feature count', 'Low feature count': 'Low feature count', 'Reused high feature count': 'Reused high feature count', 'Reused low feature count': 'Reused low feature count', 'Web objects': 'Web objects', 'Common Software Measurement International Consortium': 'Common Software Measurement International Consortium', 'International Function Point Users Group': 'International Function Point Users Group', 'Object-Oriented Heuristic Function Points': 'Object-Oriented Heuristic Function Points', 'Object-Oriented Function Points': 'Object-Oriented Function Points', 'Use case count': 'Use case count', 'Feature count': 'Feature count', 'Data Web points': 'Data Web points', 'Cohesion': 'Cohesion', 'Class coupling': 'Class coupling', 'Concern coupling': 'Concern coupling', 'Connectivity density': 'Connectivity density', 'Cyclomatic complexity': 'Cyclomatic complexity', 'Model collection complexity': 'Model collection complexity', 'Model association complexity': 'Model association complexity', 'Model link complexity': 'Model link complexity', 'Page complexity': 'Page complexity', 'Component complexity': 'Component complexity', 'Total complexity': 'Total complexity', 'Adaptation complexity': 'Adaptation complexity', 'New complexity': 'New complexity', 'Data usage complexity': 'Data usage complexity', 'Data flow complexity': 'Data flow complexity', 'Cohesion complexity': 'Cohesion complexity', 'Interface complexity': 'Interface complexity', 'Control flow complexity': 'Control flow complexity', 'Class complexity': 'Class complexity', 'Layout complexity': 'Layout complexity', 'Input complexity': 'Input complexity', 'Output complexity': 'Output complexity', 'Type': 'Type', 'Stratum': 'Stratum', 'Compactness': 'Compactness', 'Structure': 'Structure', 'Architecture': 'Architecture', 'Integration with legacy systems': 'Integration with legacy systems', 'Concurrency level': 'Concurrency level', 'Processing requirements': 'Processing requirements', 'Database size': 'Database size', 'Requirements volatility level': 'Requirements volatility level', 'Requirements novelty level': 'Requirements novelty level', 'Reliability level': 'Reliability level', 'Maintainability level': 'Maintainability level', 'Time efficiency level': 'Time efficiency level', 'Memory efficiency level': 'Memory efficiency level', 'Portability level': 'Portability level', 'Scalability level': 'Scalability level', 'Quality level': 'Quality level', 'Usability level': 'Usability level', 'Readability level': 'Readability level', 'Security level': 'Security level', 'Installability level': 'Installability level', 'Modularity level': 'Modularity level', 'Flexibility level': 'Flexibility level', 'Testability level': 'Testability level', 'Accessibility level': 'Accessibility level', 'Trainability level': 'Trainability level', 'Innovation level': 'Innovation level', 'Technical factors': 'Technical factors', 'Storage constraint': 'Storage constraint', 'Reusability level': 'Reusability level', 'Robustness level': 'Robustness level', 'Design volatility': 'Design volatility', 'Experience level': 'Experience level', 'Requirements clarity level': 'Requirements clarity level', 'Availability level': 'Availability level', 'IT literacy': 'IT literacy', 'Mapped workflows': 'Mapped workflows', 'Personality': 'Personality', 'SPI program': 'SPI program', 'Metrics’ program': 'Metrics’ program', 'Number of projects in parallel': 'Number of projects in parallel', 'Software reuse': 'Software reuse', 'Documentation level': 'Documentation level', 'Number of programming languages': 'Number of programming languages', 'Process efficiency level': 'Process efficiency level', 'Project management level': 'Project management level', 'Infrastructure': 'Infrastructure', 'Development restriction': 'Development restriction', 'Time restriction': 'Time restriction', 'Risk level': 'Risk level', 'Rapid app development': 'Rapid app development', 'Operational mode': 'Operational mode', 'Resource level': 'Resource level', 'Lessons learned repository': 'Lessons learned repository', 'Domain experience level': 'Domain experience level', 'Team size': 'Team size', 'Deployment platform experience level': 'Deployment platform experience level', 'Team capability': 'Team capability', 'Programming language experience level': 'Programming language experience level', 'Tool experience level': 'Tool experience level', 'Communication level': 'Communication level', 'Software development experience': 'Software development experience', 'Work Team level': 'Work Team level', 'Stability level': 'Stability level', 'Motivation level': 'Motivation level', 'Focus factor': 'Focus factor', 'OO experience level': 'OO experience level', 'In-house experience': 'In-house experience', 'Authoring tool type': 'Authoring tool type', 'Productivity level': 'Productivity level', 'Novelty level': 'Novelty level', 'Platform volatility level': 'Platform volatility level', 'Difficulty level': 'Difficulty level', 'Platform support level': 'Platform support level'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Length', 'Functionality', 'Object-oriented', 'Complexity']
## Found category: ['Web page count', 'Media count', 'New media count', 'New Web page count', 'Link count', 'Program count', 'Reused component count', 'Lines of code', 'Reused program count', 'Reused media count', 'Web page allocation', 'Reused lines of code', 'Media allocation', 'Reused media allocation', 'Entity count', 'Attribute count', 'Component count', 'Statement count', 'Node count', 'Collection slot size', 'Component granularity level', 'Slot granularity level', 'Model node size', 'Cluster node size', 'Node slot size', 'Publishing model unit count', 'Model slot size', 'Association slot size', 'Client script count', 'Server script count', 'Information slot count', 'Association center slot count', 'Collection center slot count', 'Component slot count', 'Semantic association count', 'Segment count', 'Slot count', 'Cluster slot count', 'Cluster count', 'Publishing unit count', 'Section count', 'Inner/sub concern count', 'Indifferent concern count', 'Module point cut count', 'Module count', 'Module attribute count', 'Operation count', 'Comment count', 'Reused comment count', 'Media duration', 'Diffusion cut count', 'Concern module count', 'Concern operation count', 'Anchor count']
## Found category: ['High feature count', 'Low feature count', 'Reused high feature count', 'Reused low feature count', 'Web objects', 'Common Software Measurement International Consortium', 'International Function Point Users Group', 'Object-Oriented Heuristic Function Points', 'Object-Oriented Function Points', 'Use case count', 'Feature count', 'Data Web points']
## Found category: ['Cohesion', 'Class coupling', 'Concern coupling']
## Found category: ['Connectivity density', 'Cyclomatic complexity', 'Model collection complexity', 'Model association complexity', 'Model link complexity', 'Page complexity', 'Component complexity', 'Total complexity', 'Adaptation complexity', 'New complexity', 'Data usage complexity', 'Data flow complexity', 'Cohesion complexity', 'Interface complexity', 'Control flow complexity', 'Class complexity', 'Layout complexity', 'Input complexity', 'Output complexity']
## Found category: ['Product', 'Client', 'Development Company', 'Project', 'Team', 'Technology']
## Found category: ['Type', 'Stratum', 'Compactness', 'Structure', 'Architecture', 'Integration with legacy systems', 'Concurrency level', 'Processing requirements', 'Database size', 'Requirements volatility level', 'Requirements novelty level', 'Reliability level', 'Maintainability level', 'Time efficiency level', 'Memory efficiency level', 'Portability level', 'Scalability level', 'Quality level', 'Usability level', 'Readability level', 'Security level', 'Installability level', 'Modularity level', 'Flexibility level', 'Testability level', 'Accessibility level', 'Trainability level', 'Innovation level', 'Technical factors', 'Storage constraint', 'Reusability level', 'Robustness level', 'Design volatility', 'Experience level', 'Requirements clarity level']
## Found category: ['Availability level', 'IT literacy', 'Mapped workflows', 'Personality']
## Found category: ['SPI program', 'Metrics’ program', 'Number of projects in parallel', 'Software reuse']
## Found category: ['Documentation level', 'Number of programming languages', 'Type', 'Process efficiency level', 'Project management level', 'Infrastructure', 'Development restriction', 'Time restriction', 'Risk level', 'Rapid app development', 'Operational mode', 'Resource level', 'Lessons learned repository']
## Found category: ['Domain experience level', 'Team size', 'Deployment platform experience level', 'Team capability', 'Programming language experience level', 'Tool experience level', 'Communication level', 'Software development experience', 'Work Team level', 'Stability level', 'Motivation level', 'Focus factor', 'OO experience level', 'In-house experience']
## Found category: ['Authoring tool type', 'Productivity level', 'Novelty level', 'Platform volatility level', 'Difficulty level', 'Platform support level']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 12
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 164
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.8751
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.20772562349345713
new_taxonomy = {
'GSE': {
'Project': {
'Site': {
"Location": "Location",
"Legal Entity": "Legal Entity",
"Geographic Distance": "Geographic Distance",
"Temporal Distance": "Temporal Distance",
"Estimation stage": {
"Early": "Early",
"Early & Late": "Early & Late",
"Late": "Late"
},
"Estimation process role": {
"Estimator": "Estimator",
"Estimator & Provider": "Estimator & Provider",
"Provider": "Provider"
}
},
'Relationship': {
"Location": "Location",
"Legal Entity": "Legal Entity",
"Geographic Distance": "Geographic Distance",
"Temporal Distance": "Temporal Distance",
"Estimation process architectural model": {
"Centralized": "Centralized",
"Distributed": "Distributed",
"Semi-distributed": "Semi-distributed"
}
}
}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Location': 'Location', 'Legal Entity': 'Legal Entity', 'Geographic Distance': 'Geographic Distance', 'Temporal Distance': 'Temporal Distance', 'Early': 'Early', 'Early & Late': 'Early & Late', 'Late': 'Late', 'Estimator': 'Estimator', 'Estimator & Provider': 'Estimator & Provider', 'Provider': 'Provider', 'Centralized': 'Centralized', 'Distributed': 'Distributed', 'Semi-distributed': 'Semi-distributed'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Site', 'Relationship']
## Found category: ['Location', 'Legal Entity', 'Geographic Distance', 'Temporal Distance', 'Estimation stage', 'Estimation process role']
## Found category: ['Early', 'Early & Late', 'Late']
## Found category: ['Estimator', 'Estimator & Provider', 'Provider']
## Found category: ['Location', 'Legal Entity', 'Geographic Distance', 'Temporal Distance', 'Estimation process architectural model']
## Found category: ['Centralized', 'Distributed', 'Semi-distributed']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 6
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 17
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 3, 4, 4, 3, 4]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.7532
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.3645130659482384
new_taxonomy = {
'Software estimation': {
'Basic Estimating Methods': {
"Algorithmic": {
"Constructive Cost Model": "Constructive Cost Model",
"Software Life Cycle Management": "Software Life Cycle Management",
"Software Evaluation and Estimation for Risk": "Software Evaluation and Estimation for Risk"
},
"Non-Algorithmic": {
"Expert Judgment": "Expert Judgment", # Corrected spelling
"Analogy-Based": "Analogy-Based"
}
},
'Combined Estimating Methods': {
"Basic-Combination": "Basic-Combination",
"Legal Entity": "Legal Entity",
"Estimation process architectural model": {
"Fuzzy Logic": "Fuzzy Logic",
"Artificial Neural Networks": "Artificial Neural Networks",
"Computational Intelligence": { # Corrected spelling
"swarm": "swarm",
"evolutionary": ""
}
},
"AI-Combined hybrid": "AI-Combined hybrid"
}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Constructive Cost Model': 'Constructive Cost Model', 'Software Life Cycle Management': 'Software Life Cycle Management', 'Software Evaluation and Estimation for Risk': 'Software Evaluation and Estimation for Risk', 'Expert Judgment': 'Expert Judgment', 'Analogy-Based': 'Analogy-Based', 'Basic-Combination': 'Basic-Combination', 'Legal Entity': 'Legal Entity', 'Fuzzy Logic': 'Fuzzy Logic', 'Artificial Neural Networks': 'Artificial Neural Networks', 'swarm': 'swarm', 'evolutionary': '', 'AI-Combined hybrid': 'AI-Combined hybrid'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Algorithmic', 'Non-Algorithmic']
## Found category: ['Constructive Cost Model', 'Software Life Cycle Management', 'Software Evaluation and Estimation for Risk']
## Found category: ['Expert Judgment', 'Analogy-Based']
## Found category: ['Basic-Combination', 'Legal Entity', 'Estimation process architectural model', 'AI-Combined hybrid']
## Found category: ['Fuzzy Logic', 'Artificial Neural Networks', 'Computational Intelligence']
## Found category: ['swarm', 'evolutionary']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 6
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 12
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 3, 3, 2, 3, 4]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [4, 4, 4, 4, 4, 3, 3, 4, 4, 5, 5, 3]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.8667
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.3722455291575357
new_taxonomy = {
"Hypermedia and Web Application Size Metrics":{
"Motivation":"Motivation",
"Harvesting time":{
"Early size metric":"Early size metric",
"Late size metric":"Late size metric"},
"Metric foundation":{
"Problem-oriented metric":"Problem-oriented metric",
"Solution-oriented metric":"Solution-oriented metric"},
"Class":{
"Length":"Length",
"Functionality":"Functionality",
"Complexity":"Complexity"},
"Entity":{
"Web hypermedia application":"Web hypermedia application",
"Web software application":"Web software application",
"Web application":"Web application",
"Media":"Media",
"Program/Script":"Program/Sript"},
"Measurement Scale":{
"Nominal":"Nominal",
"Ordinal":"Ordinal",
"Interval":"Interval",
"Ratio":"Ratio",
"Absolute":"Absolute"},
"Computation":{
"Direct":"Direct",
"Indirect":"Indirect"},
"Validation":{
"Validated Empirically":"Validated Empirically",
"Validated Theoretically":"Validated Theoretically",
"Both":"Both",
"None":"None"},
"Model dependency":{
"Specific":"Specific",
"Nonspecific":"Nonspecific"}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Motivation': 'Motivation', 'Early size metric': 'Early size metric', 'Late size metric': 'Late size metric', 'Problem-oriented metric': 'Problem-oriented metric', 'Solution-oriented metric': 'Solution-oriented metric', 'Length': 'Length', 'Functionality': 'Functionality', 'Complexity': 'Complexity', 'Web hypermedia application': 'Web hypermedia application', 'Web software application': 'Web software application', 'Web application': 'Web application', 'Media': 'Media', 'Program/Script': 'Program/Sript', 'Nominal': 'Nominal', 'Ordinal': 'Ordinal', 'Interval': 'Interval', 'Ratio': 'Ratio', 'Absolute': 'Absolute', 'Direct': 'Direct', 'Indirect': 'Indirect', 'Validated Empirically': 'Validated Empirically', 'Validated Theoretically': 'Validated Theoretically', 'Both': 'Both', 'None': 'None', 'Specific': 'Specific', 'Nonspecific': 'Nonspecific'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Early size metric', 'Late size metric']
## Found category: ['Problem-oriented metric', 'Solution-oriented metric']
## Found category: ['Length', 'Functionality', 'Complexity']
## Found category: ['Web hypermedia application', 'Web software application', 'Web application', 'Media', 'Program/Script']
## Found category: ['Nominal', 'Ordinal', 'Interval', 'Ratio', 'Absolute']
## Found category: ['Direct', 'Indirect']
## Found category: ['Validated Empirically', 'Validated Theoretically', 'Both', 'None']
## Found category: ['Specific', 'Nonspecific']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 8
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 26
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 2, 2, 2, 2, 2, 2, 2]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.8958
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.281527889373394
new_taxonomy = {
'Effort Estimation in ASD': {
'Estimation context': {
"Planning level": {
"Release": "Release",
"Sprint": "Sprint",
"Daily": "Daily",
"Bidding": "Bidding"
},
"Estimated activities": {
"Analysis": "Analysis",
"Design": "Design",
"Implementation": "Implementation",
"Testing": "Testing",
"Maintenance": "Maintenance",
"All": "All"
},
"Agile methods": {
"Extreme Programming": "Extreme Programming",
"Scrum": "Scrum",
"Customized Extreme Programming": "Customized Extreme Programming",
"Customized Scrum": "Customized Scrum",
"Dynamic Systems Development Method": "Dynamic Systems Development Method",
"Crystal": "Crystal",
"Feature-Driven Development": "Feature-Driven Development",
"Kanban": "Kanban"
},
"Project domain": {
"Communications industry": "Communications industry",
"Transportation": "Transportation",
"Financial": "Financial",
"Education": "Education",
"Health": "Health",
"Retail/Wholesale": "Retail/Wholesale",
"Manufacturing": "Manufacturing",
"Government/Military": "Government/Military",
"Other": "Other"
},
"Project setting": {
"Co-located": "Co-located",
"Distributed: Close Onshore": "Distributed: Close Onshore",
"Distributed: Distant Onshore": "Distributed: Distant Onshore",
"Distributed: Near Offshore": "Distributed: Near Offshore",
"Distributed: Far Offshore": "Distributed: Far Offshore"
},
"Estimation entity": {
"User story": "User story",
"Task": "Task",
"Use case": "Use case",
"Other": "Other"
},
"Number of entities estimated": {
"Value": "Value"
},
"Team size": {
"No. of team members": "No. of team members"
}
},
'Estimation technique': {
"Estimation Techniques": {
"Planning Poker": "Planning Poker",
"Expert Judgement": "Expert Judgement",
"Analogy": "Analogy",
"Use case points method": "Use case points method",
"Other": "Other"
},
"Type": {
"Single": "Single",
"Group": "Group"
}
},
'Effort predictors': {
"Size": {
"Story points": "Story points",
"User case points": "User case points",
"Function points": "Function points",
"Other": "Other",
"Not used": "Not used",
"Considered without any metric": "Considered without any metric"
},
"Team's prior experience": {
"Considered": "Considered",
"Not Considered": "Not Considered"
},
"Team's skill level": {
"Considered": "Considered",
"Not Considered": "Not Considered"
},
"Non functional requirements": {
"Performance": "Performance",
"Security": "Security",
"Availability": "Availability",
"Reliability": "Reliability",
"Maintainability": "Maintainability",
"Other": "Other", # Changed period to comma
"Not considered": "Not considered"
},
"Distributed teams' issues": {
"Considered": "Considered",
"Not Considered": "Not Considered",
"Not applicable": "Not applicable"
},
"Customer Communication": {
"Considered": "Considered",
"Not Considered": "Not Considered"
}
},
'Effort estimate': {
"Estimated effort": {
"Estimate value(s)": "Estimate value(s)"
},
"Actual effort": {
"Value": "Value"
},
"Type": {
"Point": "Point",
"Three point": "Three point",
"Distribution": "Distribution",
"Other": "Other"
},
"Unit": {
"House/days": "House/days",
"Pair days": "Pair/days",
"Ideal hours": "Ideal hours",
"Other": "Other"
},
"Accuracy Level": {
"Value": "Value"
},
"Accuracy measure": {
"Mean Magnitude of Relative Error": "Mean Magnitude of Relative Error",
"Median Magnitude of Relative Error": "Median Magnitude of Relative Error",
"Bias of Relative Error": "Bias of Relative Error",
"Other": "Other",
"Not used": "Not used"
}
}
}
}
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)
## {'Release': 'Release', 'Sprint': 'Sprint', 'Daily': 'Daily', 'Bidding': 'Bidding', 'Analysis': 'Analysis', 'Design': 'Design', 'Implementation': 'Implementation', 'Testing': 'Testing', 'Maintenance': 'Maintenance', 'All': 'All', 'Extreme Programming': 'Extreme Programming', 'Scrum': 'Scrum', 'Customized Extreme Programming': 'Customized Extreme Programming', 'Customized Scrum': 'Customized Scrum', 'Dynamic Systems Development Method': 'Dynamic Systems Development Method', 'Crystal': 'Crystal', 'Feature-Driven Development': 'Feature-Driven Development', 'Kanban': 'Kanban', 'Communications industry': 'Communications industry', 'Transportation': 'Transportation', 'Financial': 'Financial', 'Education': 'Education', 'Health': 'Health', 'Retail/Wholesale': 'Retail/Wholesale', 'Manufacturing': 'Manufacturing', 'Government/Military': 'Government/Military', 'Other': 'Other', 'Co-located': 'Co-located', 'Distributed: Close Onshore': 'Distributed: Close Onshore', 'Distributed: Distant Onshore': 'Distributed: Distant Onshore', 'Distributed: Near Offshore': 'Distributed: Near Offshore', 'Distributed: Far Offshore': 'Distributed: Far Offshore', 'User story': 'User story', 'Task': 'Task', 'Use case': 'Use case', 'Value': 'Value', 'No. of team members': 'No. of team members', 'Planning Poker': 'Planning Poker', 'Expert Judgement': 'Expert Judgement', 'Analogy': 'Analogy', 'Use case points method': 'Use case points method', 'Single': 'Single', 'Group': 'Group', 'Story points': 'Story points', 'User case points': 'User case points', 'Function points': 'Function points', 'Not used': 'Not used', 'Considered without any metric': 'Considered without any metric', 'Considered': 'Considered', 'Not Considered': 'Not Considered', 'Performance': 'Performance', 'Security': 'Security', 'Availability': 'Availability', 'Reliability': 'Reliability', 'Maintainability': 'Maintainability', 'Not considered': 'Not considered', 'Not applicable': 'Not applicable', 'Estimate value(s)': 'Estimate value(s)', 'Point': 'Point', 'Three point': 'Three point', 'Distribution': 'Distribution', 'House/days': 'House/days', 'Pair days': 'Pair/days', 'Ideal hours': 'Ideal hours', 'Mean Magnitude of Relative Error': 'Mean Magnitude of Relative Error', 'Median Magnitude of Relative Error': 'Median Magnitude of Relative Error', 'Bias of Relative Error': 'Bias of Relative Error'}
ncat = extract_ncat(new_taxonomy)
## Found category: ['Planning level', 'Estimated activities', 'Agile methods', 'Project domain', 'Project setting', 'Estimation entity', 'Number of entities estimated', 'Team size']
## Found category: ['Release', 'Sprint', 'Daily', 'Bidding']
## Found category: ['Analysis', 'Design', 'Implementation', 'Testing', 'Maintenance', 'All']
## Found category: ['Extreme Programming', 'Scrum', 'Customized Extreme Programming', 'Customized Scrum', 'Dynamic Systems Development Method', 'Crystal', 'Feature-Driven Development', 'Kanban']
## Found category: ['Communications industry', 'Transportation', 'Financial', 'Education', 'Health', 'Retail/Wholesale', 'Manufacturing', 'Government/Military', 'Other']
## Found category: ['Co-located', 'Distributed: Close Onshore', 'Distributed: Distant Onshore', 'Distributed: Near Offshore', 'Distributed: Far Offshore']
## Found category: ['User story', 'Task', 'Use case', 'Other']
## Found category: ['Value']
## Found category: ['No. of team members']
## Found category: ['Estimation Techniques', 'Type']
## Found category: ['Planning Poker', 'Expert Judgement', 'Analogy', 'Use case points method', 'Other']
## Found category: ['Single', 'Group']
## Found category: ['Size', "Team's prior experience", "Team's skill level", 'Non functional requirements', "Distributed teams' issues", 'Customer Communication']
## Found category: ['Story points', 'User case points', 'Function points', 'Other', 'Not used', 'Considered without any metric']
## Found category: ['Considered', 'Not Considered']
## Found category: ['Considered', 'Not Considered']
## Found category: ['Performance', 'Security', 'Availability', 'Reliability', 'Maintainability', 'Other', 'Not considered']
## Found category: ['Considered', 'Not Considered', 'Not applicable']
## Found category: ['Considered', 'Not Considered']
## Found category: ['Estimated effort', 'Actual effort', 'Type', 'Unit', 'Accuracy Level', 'Accuracy measure']
## Found category: ['Estimate value(s)']
## Found category: ['Value']
## Found category: ['Point', 'Three point', 'Distribution', 'Other']
## Found category: ['House/days', 'Pair days', 'Ideal hours', 'Other']
## Found category: ['Value']
## Found category: ['Mean Magnitude of Relative Error', 'Median Magnitude of Relative Error', 'Bias of Relative Error', 'Other', 'Not used']
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)
print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 26
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 83
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.8267
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.22706778267599811
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # Import for 3D plotting
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
# Step 1: Define the sets
# Combine the sets into a dictionary
sets = {
'Bajta': {"Agile", "Analysis", "Availability", "Baseline comparison", "Bidding", "CBR", "CMMI", "COCOMO", "Commissioning", "Conceptualization", "Delphi", "Detail planning", "Design", "Distant onshore", "Expert judgment", "Estimated value", "Execution", "Effort hours", "Feasibility study", "Finance", "Fuzzy similarity", "GA", "Group-based estimation", "Healthcare", "Hardware", "Implementation", "Individual", "Machine learning", "Maintainability", "Maintenance", "Near offshore", "Non-machine learning", "Not considered", "Number of team members", "Performance", "Portfolio", "Preliminary planning", "Reliability", "Research & development", "Risk", "Security", "Sensitivity analysis", "Size report", "Socio-cultural distance", "Statistical analysis", "Staff/cost", "System investigation", "Temporal distance", "Testing", "Value", "Variation reduction"},
'Britto_2017': {"Accessibility level", "Adaptation complexity", "Anchor count", "Architecture", "Association center slot count", "Association slot size", "Attribute count", "Authoring tool type", "Availability level", "Class complexity", "Class coupling", "Client script count", "Cluster count", "Cluster node size", "Cluster slot count", "Cohesion", "Cohesion complexity", "Collection center slot count", "Collection slot size", "Comment count", "Communication level", "Compactness", "Component complexity", "Component count", "Component granularity level", "Component slot count", "Concern coupling", "Concern module count", "Concern operation count", "Concurrency level", "Connectivity density", "Control flow complexity", "Cyclomatic complexity", "Data Web points", "Data flow complexity", "Data usage complexity", "Database size", "Deployment platform experience level", "Design volatility", "Development restriction", "Difficulty level", "Diffusion cut count", "Documentation level", "Domain experience level", "Entity count", "Experience level", "Feature count", "Flexibility level", "Focus factor", "High feature count", "IT literacy", "In-house experience", "Indifferent concern count", "Information slot count", "Infrastructure", "Inner/sub concern count", "Innovation level", "Input complexity", "Installability level", "Integration with legacy systems", "Interface complexity", "International Function Point Users Group", "Layout complexity", "Lessons learned repository", "Lines of code", "Link count", "Low feature count", "Maintainability level", "Mapped workflows", "Media allocation", "Media count", "Media duration", "Memory efficiency level", "Metrics program", "Model association complexity", "Model collection complexity", "Model link complexity", "Model node size", "Model slot size", "Modularity level", "Module attribute count", "Module count", "Module point cut count", "Motivation level", "New Web page count", "New complexity", "New media count", "Node count", "Node slot size", "Novelty level", "Number of programming languages", "Number of projects in parallel", "OO experience level", "Object-Oriented Function Points", "Operation count", "Operational mode", "Output complexity", "Page complexity", "Personality", "Platform support level", "Platform volatility level", "Portability level", "Process efficiency level", "Processing requirements", "Productivity level", "Program count", "Programming language experience level", "Project management level", "Publishing model unit count", "Publishing unit count", "Quality level", "Rapid app development", "Readability level", "Reliability level", "Requirements clarity level", "Requirements novelty level", "Requirements volatility level", "Resource level", "Reusability level", "Reused comment count", "Reused component count", "Reused high feature count", "Reused lines of code", "Reused low feature count", "Reused media allocation", "Reused media count", "Reused program count", "Risk level", "Robustness level", "SPI program", "Scalability level", "Section count", "Security level", "Segment count", "Semantic association count", "Server script count", "Slot count", "Slot granularity level", "Software development experience", "Software reuse", "Stability level", "Statement count", "Storage constraint", "Structure", "Team capability", "Team size", "Technical factors", "Testability level", "Time efficiency level", "Time restriction", "Tool experience level", "Total complexity", "Trainability level", "Type", "Usability level", "Use case count", "Web objects", "Web page allocation", "Web page count", "Work Team level"},
'Britto_2016': {"Centralized", "distributed", "Early", "Estimator", "Early & Late", "Estimator & Provider", "geographic distance", "geographic distance", "late", "legal entity", "location", "provider", "semi-distributed", "temporal distance", "temporal distance"},
'Dasthi': {"ANN", "Analogy Base", "COCOMO", "Evolutionary", "Expert Judgment", "FUZZY", "SEER-SEM", "SLIM", "Swarm"},
'Mendes': {"Absolute", "both", "complexity", "functionality", "Directly", "Early size metric", "Empirically", "indirectly", "interval", "Length", "late size metric", "media", "none", "Nominal", "nonspecific", "ordinal", "other", "Problem oriented metric", "program/script", "ratio", "solution oriented metric", "Specific", "theoretically", "Web application", "Web hypermedia application", "Web software application"},
'Usman': {"Analysis", "all", "analogy", "availability", "bidding", "Close Onshore", "Co-located", "Communications industry", "Considered", "crystal", "customized XP", "customized scrum", "daily", "design", "distribution", "education", "expert judgement", "DSDM", "Distant Onshore", "Estimate value(s)", "FDD", "Far Offshore", "financial", "function points", "Hours/days", "health", "ideal hours", "implementation", "kanban", "maintainability", "maintenance", "manufacturing", "MMRE", "MdMRE", "Near Offshore", "No. of team members", "not applicable", "not considered", "not used", "Other", "Performance", "Planning poker", "Point", "pair days", "Release", "reliability", "retail/wholesale", "Single", "scrum", "security", "sprint", "Story points", "testing", "three point", "task", "transportation", "UC points", "User story", "Value", "XP"}
}
# Step 2: Flatten the sets into a dataframe (assuming 'sets' is already defined)
words = []
labels = []
for label, words_set in sets.items():
for word in words_set:
words.append(word)
labels.append(label)
# Create a dataframe
df = pd.DataFrame({'Word': words, 'Set': labels})
# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
if 'model' not in locals() or 'tokenizer' not in locals():
print("Loading model and tokenizer...")
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
print("Model and tokenizer are already loaded.")
## Loading model and tokenizer...
# Step 4: Get the embeddings for each word
def get_embeddings(word):
inputs = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
embeddings = np.array([get_embeddings(word) for word in df['Word']])
# Step 5: Perform t-SNE (now in 2D)
tsne = TSNE(n_components=2, perplexity=30, random_state=5)
embeddings_2d = tsne.fit_transform(embeddings)
## C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
## found 0 physical cores < 1
## Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
## warnings.warn(
## File "C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
## raise ValueError(f"found {cpu_count_physical} physical cores < 1")
# Step 6: Convert string labels to numeric labels for coloring
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels)
# Step 7: Create the 2D scatter plot
fig, ax = plt.subplots(figsize=(10, 7))
# Plot the 2D scatter with the numeric labels for colors
scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
c=numeric_labels, cmap='Set1', s=100)
# Annotate each point with the word
for i, word in enumerate(df['Word']):
ax.text(embeddings_2d[i, 0] + 0.1, embeddings_2d[i, 1] + 0.1, word, fontsize=9)
# Step 8: Add labels and title
ax.set_title("2D t-SNE Visualization of Word Embeddings")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
# Step 9: Move the legend outside of the plot
legend_labels = label_encoder.classes_
handles = [plt.Line2D([0], [0], marker='o', color='w',
markerfacecolor=plt.cm.Set2(i / len(legend_labels)), markersize=5)
for i in range(len(legend_labels))]
ax.legend(handles, legend_labels, title="Set", loc="center left", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)
# Step 10: Show the plot
plt.tight_layout() # Ensures proper spacing with the legend outside
plt.savefig('word_embeddings.png', dpi=300, bbox_inches='tight')
plt.show()
#3D PLOT
import plotly.express as px
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
# Step 2: Flatten the sets into a dataframe (assuming sets is already defined)
words = []
labels = []
for label, words_set in sets.items():
for word in words_set:
words.append(word)
labels.append(label)
# Create a dataframe
df = pd.DataFrame({'Word': words, 'Set': labels})
# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
if 'model' not in locals() or 'tokenizer' not in locals():
print("Loading model and tokenizer...")
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Step 4: Get the embeddings for each word
def get_embeddings(word):
inputs = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
embeddings = np.array([get_embeddings(word) for word in df['Word']])
# Step 5: Perform 3D UMAP (with 3 components)
umap_model = umap.UMAP(n_components=3, random_state=5)
embeddings_3d = umap_model.fit_transform(embeddings)
## C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\umap\umap_.py:1952: UserWarning:
##
## n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
# Step 6: Convert string labels to numeric labels for coloring
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels)
# Step 7: Create the interactive 3D plot with Plotly
fig = px.scatter_3d(df, x=embeddings_3d[:, 0], y=embeddings_3d[:, 1], z=embeddings_3d[:, 2],
color=labels, text=words,
labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2', 'z': 'UMAP Dimension 3'},
title="3D UMAP Visualization of Word Embeddings")
# Customize the layout for better viewing
fig.update_traces(marker=dict(size=5, opacity=0.8), selector=dict(mode='markers+text'))
fig.update_layout(scene=dict(xaxis_title='UMAP Dimension 1',
yaxis_title='UMAP Dimension 2',
zaxis_title='UMAP Dimension 3'))
plt.savefig('3d_word_embedding.png', dpi=300, bbox_inches='tight')
# Show the interactive plot
fig.show()
import umap.umap_ as umap
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib.lines import Line2D # Add this import at the top of your code
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
# Step 1: Define the sets
Bajta = {"Agile", "Analysis", "Availability", "Baseline comparison", "Bidding", "CBR", "CMMI", "COCOMO", "Commissioning", "Conceptualization", "Delphi", "Detail planning", "Design", "Distant onshore", "Expert judgment", "Estimated value", "Execution", "Effort hours", "Feasibility study", "Finance", "Fuzzy similarity", "GA", "Group-based estimation", "Healthcare", "Hardware", "Implementation", "Individual", "Machine learning", "Maintainability", "Maintenance", "Near offshore", "Non-machine learning", "Not considered", "Number of team members", "Performance", "Portfolio", "Preliminary planning", "Reliability", "Research & development", "Risk", "Security", "Sensitivity analysis", "Size report", "Socio-cultural distance", "Statistical analysis", "Staff/cost", "System investigation", "Temporal distance", "Testing", "Value", "Variation reduction"}
Britto_2017 = {"Accessibility level", "Adaptation complexity", "Anchor count", "Architecture", "Association center slot count", "Association slot size", "Attribute count", "Authoring tool type", "Availability level", "Class complexity", "Class coupling", "Client script count", "Cluster count", "Cluster node size", "Cluster slot count", "Cohesion", "Cohesion complexity", "Collection center slot count", "Collection slot size", "Comment count", "Communication level", "Compactness", "Component complexity", "Component count", "Component granularity level", "Component slot count", "Concern coupling", "Concern module count", "Concern operation count", "Concurrency level", "Connectivity density", "Control flow complexity", "Cyclomatic complexity", "Data Web points", "Data flow complexity", "Data usage complexity", "Database size", "Deployment platform experience level", "Design volatility", "Development restriction", "Difficulty level", "Diffusion cut count", "Documentation level", "Domain experience level", "Entity count", "Experience level", "Feature count", "Flexibility level", "Focus factor", "High feature count", "IT literacy", "In-house experience", "Indifferent concern count", "Information slot count", "Infrastructure", "Inner/sub concern count", "Innovation level", "Input complexity", "Installability level", "Integration with legacy systems", "Interface complexity", "International Function Point Users Group", "Layout complexity", "Lessons learned repository", "Lines of code", "Link count", "Low feature count", "Maintainability level", "Mapped workflows", "Media allocation", "Media count", "Media duration", "Memory efficiency level", "Metrics program", "Model association complexity", "Model collection complexity", "Model link complexity", "Model node size", "Model slot size", "Modularity level", "Module attribute count", "Module count", "Module point cut count", "Motivation level", "New Web page count", "New complexity", "New media count", "Node count", "Node slot size", "Novelty level", "Number of programming languages", "Number of projects in parallel", "OO experience level", "Object-Oriented Function Points", "Operation count", "Operational mode", "Output complexity", "Page complexity", "Personality", "Platform support level", "Platform volatility level", "Portability level", "Process efficiency level", "Processing requirements", "Productivity level", "Program count", "Programming language experience level", "Project management level", "Publishing model unit count", "Publishing unit count", "Quality level", "Rapid app development", "Readability level", "Reliability level", "Requirements clarity level", "Requirements novelty level", "Requirements volatility level", "Resource level", "Reusability level", "Reused comment count", "Reused component count", "Reused high feature count", "Reused lines of code", "Reused low feature count", "Reused media allocation", "Reused media count", "Reused program count", "Risk level", "Robustness level", "SPI program", "Scalability level", "Section count", "Security level", "Segment count", "Semantic association count", "Server script count", "Slot count", "Slot granularity level", "Software development experience", "Software reuse", "Stability level", "Statement count", "Storage constraint", "Structure", "Team capability", "Team size", "Technical factors", "Testability level", "Time efficiency level", "Time restriction", "Tool experience level", "Total complexity", "Trainability level", "Type", "Usability level", "Use case count", "Web objects", "Web page allocation", "Web page count", "Work Team level"}
Britto_2016 = {"Centralized", "distributed", "Early", "Estimator", "Early & Late", "Estimator & Provider", "geographic distance", "geographic distance", "late", "legal entity", "location", "provider", "semi-distributed", "temporal distance", "temporal distance"}
Dasthi = {"ANN", "Analogy Base", "COCOMO", "Evolutionary", "Expert Judgment", "FUZZY", "SEER-SEM", "SLIM", "Swarm"}
Mendes = {"Absolute", "both", "complexity", "functionality", "Directly", "Early size metric", "Empirically", "indirectly", "interval", "Length", "late size metric", "media", "none", "Nominal", "nonspecific", "ordinal", "other", "Problem oriented metric", "program/script", "ratio", "solution oriented metric", "Specific", "theoretically", "Web application", "Web hypermedia application", "Web software application"}
Usman = {"Analysis", "all", "analogy", "availability", "bidding", "Close Onshore", "Co-located", "Communications industry", "Considered", "crystal", "customized XP", "customized scrum", "daily", "design", "distribution", "education", "expert judgement", "DSDM", "Distant Onshore", "Estimate value(s)", "FDD", "Far Offshore", "financial", "function points", "Hours/days", "health", "ideal hours", "implementation", "kanban", "maintainability", "maintenance", "manufacturing", "MMRE", "MdMRE", "Near Offshore", "No. of team members", "not applicable", "not considered", "not used", "Other", "Performance", "Planning poker", "Point", "pair days", "Release", "reliability", "retail/wholesale", "Single", "scrum", "security", "sprint", "Story points", "testing", "three point", "task", "transportation", "UC points", "User story", "Value", "XP"}
# Combine the sets into a dictionary
sets = {
'Bajta': Bajta,
'Britto_2017': Britto_2017,
'Britto_2016': Britto_2016,
'Dasthi': Dasthi,
'Mendes': Mendes,
'Usman': Usman
}
# Step 2: Flatten the sets into a dataframe
words = []
labels = []
for label, words_set in sets.items():
for word in words_set:
words.append(word.lower())
labels.append(label)
# Create a dataframe
df = pd.DataFrame({'Word': words, 'Set': labels})
# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
if 'model' not in locals() or 'tokenizer' not in locals():
print("Loading model and tokenizer...")
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Step 4: Get the embeddings for each word
def get_embeddings(word):
inputs = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
embeddings = np.array([get_embeddings(word) for word in df['Word']])
# Step 5: Perform 2D UMAP
umap_model = umap.UMAP(n_components=2, random_state=5)
embeddings_2d = umap_model.fit_transform(embeddings)
## C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\umap\umap_.py:1952: UserWarning:
##
## n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
# Step 6: Create a color map that reflects the set labels
unique_labels = list(df['Set'].unique()) # Get the unique set labels
cmap = plt.cm.get_cmap('tab10', len(unique_labels)) # Create a colormap with enough colors
## <string>:1: MatplotlibDeprecationWarning:
##
## The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
# Step 7: Run K-means on UMAP embeddings
num_clusters = len(unique_labels) # Set number of clusters to match unique labels
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=5)
kmeans_labels = kmeans.fit_predict(embeddings_2d)
# Step 8: Generate top 4 names for each cluster
top_n = 3 # Set how many top words to display for each cluster
cluster_names = []
for i in range(num_clusters):
# Get the embeddings for words in the current cluster
cluster_indices = np.where(kmeans_labels == i)[0]
cluster_embeddings = embeddings[cluster_indices]
# Calculate the centroid of the cluster
cluster_centroid = np.mean(cluster_embeddings, axis=0).reshape(1, -1)
# Calculate cosine similarity of centroid to all words' embeddings to find closest words
similarities = cosine_similarity(cluster_centroid, embeddings).flatten()
# Get the indices of the top 4 closest words
closest_word_indices = np.argsort(similarities)[-top_n:][::-1] # Get indices of top 4 closest words
# Get the words corresponding to these indices
closest_words = df['Word'].iloc[closest_word_indices].tolist()
# Store the top 4 closest words as the cluster name
cluster_names.append(closest_words)
# Step 9: Plot with translucent shapes for each K-means cluster and annotate with top 4 names
plt.figure(figsize=(10, 7))
color_map = {label: cmap(i) for i, label in enumerate(unique_labels)}
# Plot the 2D scatter with the original labels for colors
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
c=[color_map[label] for label in df['Set']], s=50)
# Draw convex hulls around each cluster and annotate with cluster names
for i in range(num_clusters):
cluster_points = embeddings_2d[kmeans_labels == i]
if len(cluster_points) >= 3: # ConvexHull requires at least 3 points
hull = ConvexHull(cluster_points)
hull_points = cluster_points[hull.vertices]
plt.fill(hull_points[:, 0], hull_points[:, 1], alpha=0.2,
color=cmap(i), label=f'Cluster {i+1}')
# Annotate with the top 4 cluster names at the centroid location
cluster_centroid_2d = np.mean(cluster_points, axis=0)
# Join the top 4 words into a string with commas for cleaner display
cluster_name_text = ', '.join(cluster_names[i])
# Annotate with the top words at the centroid, with slightly smaller font size
plt.text(cluster_centroid_2d[0], cluster_centroid_2d[1], cluster_name_text,
fontsize=7, ha='center', color='black', fontweight='bold')
# Step 10: Add plot title and axis labels
plt.title("2D UMAP Visualization of Word Embeddings with K-means Clusters")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
# Step 11: Show the plot with the updated legend
legend_handles = [Line2D([0], [0], marker='o', color='w',
markerfacecolor=cmap(i), markersize=10)
for i, label in enumerate(unique_labels)]
plt.legend(legend_handles, unique_labels, title="Literature", loc="center left", bbox_to_anchor=(1, 0.5))
# Adjust layout to ensure the legend is not clipped
plt.tight_layout()
# Step 10: Save the plot in high resolution
plt.savefig('word_embeddings_kmeans.png', dpi=300, bbox_inches='tight')
# Show the plot
plt.show()
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
# Define the sets of words
Bajta = {"Agile", "Analysis", "Availability", "Baseline comparison", "Bidding", "CBR", "CMMI", "COCOMO", "Commissioning", "Conceptualization", "Delphi", "Detail planning", "Design", "Distant onshore", "Expert judgment", "Estimated value", "Execution", "Effort hours", "Feasibility study", "Finance", "Fuzzy similarity", "GA", "Group-based estimation", "Healthcare", "Hardware", "Implementation", "Individual", "Machine learning", "Maintainability", "Maintenance", "Near offshore", "Non-machine learning", "Not considered", "Number of team members", "Performance", "Portfolio", "Preliminary planning", "Reliability", "Research & development", "Risk", "Security", "Sensitivity analysis", "Size report", "Socio-cultural distance", "Statistical analysis", "Staff/cost", "System investigation", "Temporal distance", "Testing", "Value", "Variation reduction"}
Britto_2017 = {"Accessibility level", "Adaptation complexity", "Anchor count", "Architecture", "Association center slot count", "Association slot size", "Attribute count", "Authoring tool type", "Availability level", "Class complexity", "Class coupling", "Client script count", "Cluster count", "Cluster node size", "Cluster slot count", "Cohesion", "Cohesion complexity", "Collection center slot count", "Collection slot size", "Comment count", "Communication level", "Compactness", "Component complexity", "Component count", "Component granularity level", "Component slot count", "Concern coupling", "Concern module count", "Concern operation count", "Concurrency level", "Connectivity density", "Control flow complexity", "Cyclomatic complexity", "Data Web points", "Data flow complexity", "Data usage complexity", "Database size", "Deployment platform experience level", "Design volatility", "Development restriction", "Difficulty level", "Diffusion cut count", "Documentation level", "Domain experience level", "Entity count", "Experience level", "Feature count", "Flexibility level", "Focus factor", "High feature count", "IT literacy", "In-house experience", "Indifferent concern count", "Information slot count", "Infrastructure", "Inner/sub concern count", "Innovation level", "Input complexity", "Installability level", "Integration with legacy systems", "Interface complexity", "International Function Point Users Group", "Layout complexity", "Lessons learned repository", "Lines of code", "Link count", "Low feature count", "Maintainability level", "Mapped workflows", "Media allocation", "Media count", "Media duration", "Memory efficiency level", "Metrics program", "Model association complexity", "Model collection complexity", "Model link complexity", "Model node size", "Model slot size", "Modularity level", "Module attribute count", "Module count", "Module point cut count", "Motivation level", "New Web page count", "New complexity", "New media count", "Node count", "Node slot size", "Novelty level", "Number of programming languages", "Number of projects in parallel", "OO experience level", "Object-Oriented Function Points", "Operation count", "Operational mode", "Output complexity", "Page complexity", "Personality", "Platform support level", "Platform volatility level", "Portability level", "Process efficiency level", "Processing requirements", "Productivity level", "Program count", "Programming language experience level", "Project management level", "Publishing model unit count", "Publishing unit count", "Quality level", "Rapid app development", "Readability level", "Reliability level", "Requirements clarity level", "Requirements novelty level", "Requirements volatility level", "Resource level", "Reusability level", "Reused comment count", "Reused component count", "Reused high feature count", "Reused lines of code", "Reused low feature count", "Reused media allocation", "Reused media count", "Reused program count", "Risk level", "Robustness level", "SPI program", "Scalability level", "Section count", "Security level", "Segment count", "Semantic association count", "Server script count", "Slot count", "Slot granularity level", "Software development experience", "Software reuse", "Stability level", "Statement count", "Storage constraint", "Structure", "Team capability", "Team size", "Technical factors", "Testability level", "Time efficiency level", "Time restriction", "Tool experience level", "Total complexity", "Trainability level", "Type", "Usability level", "Use case count", "Web objects", "Web page allocation", "Web page count", "Work Team level"}
Britto_2016 = {"Centralized", "distributed", "Early", "Estimator", "Early & Late", "Estimator & Provider", "geographic distance", "geographic distance", "late", "legal entity", "location", "provider", "semi-distributed", "temporal distance", "temporal distance"}
Dasthi = {"ANN", "Analogy Base", "COCOMO", "Evolutionary", "Expert Judgment", "FUZZY", "SEER-SEM", "SLIM", "Swarm"}
Mendes = {"Absolute", "both", "complexity", "functionality", "Directly", "Early size metric", "Empirically", "indirectly", "interval", "Length", "late size metric", "media", "none", "Nominal", "nonspecific", "ordinal", "other", "Problem oriented metric", "program/script", "ratio", "solution oriented metric", "Specific", "theoretically", "Web application", "Web hypermedia application", "Web software application"}
Usman = {"Analysis", "all", "analogy", "availability", "bidding", "Close Onshore", "Co-located", "Communications industry", "Considered", "crystal", "customized XP", "customized scrum", "daily", "design", "distribution", "education", "expert judgement", "DSDM", "Distant Onshore", "Estimate value(s)", "FDD", "Far Offshore", "financial", "function points", "Hours/days", "health", "ideal hours", "implementation", "kanban", "maintainability", "maintenance", "manufacturing", "MMRE", "MdMRE", "Near Offshore", "No. of team members", "not applicable", "not considered", "not used", "Other", "Performance", "Planning poker", "Point", "pair days", "Release", "reliability", "retail/wholesale", "Single", "scrum", "security", "sprint", "Story points", "testing", "three point", "task", "transportation", "UC points", "User story", "Value", "XP"}
# Combine all sets into a single list with labels
word_sets = {
"Bajta": Bajta,
"Britto_2016": Britto_2016,
"Britto_2017": Britto_2017,
"Dasthi": Dasthi,
"Mendes": Mendes,
"Usman": Usman
}
word_sets = {label: {word.lower() for word in words} for label, words in word_sets.items()}
# Load model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
if 'model' not in locals() or 'tokenizer' not in locals():
print("Loading model and tokenizer...")
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Function to get embedding for a word
def get_embedding(word):
inputs = tokenizer(word, return_tensors="pt")
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).detach().numpy()
# Collect embeddings
embeddings = []
words = []
labels = []
for label, words_set in word_sets.items():
for word in words_set:
embedding = get_embedding(word)
embeddings.append(embedding)
words.append(word)
labels.append(label)
# Create a DataFrame with words, labels, and embeddings
embedding_df = pd.DataFrame({
"Word": words,
"Label": labels,
"Embedding": [emb[0] for emb in embeddings]
})
# Pivot the DataFrame to have the set labels as columns
pivoted_df = embedding_df.pivot(index="Word", columns="Label", values="Embedding")
# Flatten the embeddings (if you want to display them properly as vectors, you might want to separate them)
# Convert the embedding vectors to string for display purposes (or keep them as arrays if you're working with them in computations)
pivoted_df = pivoted_df.applymap(lambda x: str(x.tolist()) if isinstance(x, np.ndarray) else x)
## <string>:4: FutureWarning:
##
## DataFrame.applymap has been deprecated. Use DataFrame.map instead.
# Display the pivoted DataFrame
print(pivoted_df)
## Label Bajta ... Usman
## Word ...
## absolute NaN ... NaN
## accessibility level NaN ... NaN
## adaptation complexity NaN ... NaN
## agile [2.776266098022461, -2.1827030181884766, 1.469... ... NaN
## all NaN ... [1.5750207901000977, -2.3228142261505127, 1.04...
## ... ... ... ...
## web page allocation NaN ... NaN
## web page count NaN ... NaN
## web software application NaN ... NaN
## work team level NaN ... NaN
## xp NaN ... [2.5570878982543945, -1.4092556238174438, -0.1...
##
## [300 rows x 6 columns]
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
# Define the sets of words
Bajta = {"Agile", "Analysis", "Availability", "Baseline comparison", "Bidding", "CBR", "CMMI", "COCOMO", "Commissioning", "Conceptualization", "Delphi", "Detail planning", "Design", "Distant onshore", "Expert judgment", "Estimated value", "Execution", "Effort hours", "Feasibility study", "Finance", "Fuzzy similarity", "GA", "Group-based estimation", "Healthcare", "Hardware", "Implementation", "Individual", "Machine learning", "Maintainability", "Maintenance", "Near offshore", "Non-machine learning", "Not considered", "Number of team members", "Performance", "Portfolio", "Preliminary planning", "Reliability", "Research & development", "Risk", "Security", "Sensitivity analysis", "Size report", "Socio-cultural distance", "Statistical analysis", "Staff/cost", "System investigation", "Temporal distance", "Testing", "Value", "Variation reduction"}
Britto_2017 = {"Accessibility level", "Adaptation complexity", "Anchor count", "Architecture", "Association center slot count", "Association slot size", "Attribute count", "Authoring tool type", "Availability level", "Class complexity", "Class coupling", "Client script count", "Cluster count", "Cluster node size", "Cluster slot count", "Cohesion", "Cohesion complexity", "Collection center slot count", "Collection slot size", "Comment count", "Communication level", "Compactness", "Component complexity", "Component count", "Component granularity level", "Component slot count", "Concern coupling", "Concern module count", "Concern operation count", "Concurrency level", "Connectivity density", "Control flow complexity", "Cyclomatic complexity", "Data Web points", "Data flow complexity", "Data usage complexity", "Database size", "Deployment platform experience level", "Design volatility", "Development restriction", "Difficulty level", "Diffusion cut count", "Documentation level", "Domain experience level", "Entity count", "Experience level", "Feature count", "Flexibility level", "Focus factor", "High feature count", "IT literacy", "In-house experience", "Indifferent concern count", "Information slot count", "Infrastructure", "Inner/sub concern count", "Innovation level", "Input complexity", "Installability level", "Integration with legacy systems", "Interface complexity", "International Function Point Users Group", "Layout complexity", "Lessons learned repository", "Lines of code", "Link count", "Low feature count", "Maintainability level", "Mapped workflows", "Media allocation", "Media count", "Media duration", "Memory efficiency level", "Metrics program", "Model association complexity", "Model collection complexity", "Model link complexity", "Model node size", "Model slot size", "Modularity level", "Module attribute count", "Module count", "Module point cut count", "Motivation level", "New Web page count", "New complexity", "New media count", "Node count", "Node slot size", "Novelty level", "Number of programming languages", "Number of projects in parallel", "OO experience level", "Object-Oriented Function Points", "Operation count", "Operational mode", "Output complexity", "Page complexity", "Personality", "Platform support level", "Platform volatility level", "Portability level", "Process efficiency level", "Processing requirements", "Productivity level", "Program count", "Programming language experience level", "Project management level", "Publishing model unit count", "Publishing unit count", "Quality level", "Rapid app development", "Readability level", "Reliability level", "Requirements clarity level", "Requirements novelty level", "Requirements volatility level", "Resource level", "Reusability level", "Reused comment count", "Reused component count", "Reused high feature count", "Reused lines of code", "Reused low feature count", "Reused media allocation", "Reused media count", "Reused program count", "Risk level", "Robustness level", "SPI program", "Scalability level", "Section count", "Security level", "Segment count", "Semantic association count", "Server script count", "Slot count", "Slot granularity level", "Software development experience", "Software reuse", "Stability level", "Statement count", "Storage constraint", "Structure", "Team capability", "Team size", "Technical factors", "Testability level", "Time efficiency level", "Time restriction", "Tool experience level", "Total complexity", "Trainability level", "Type", "Usability level", "Use case count", "Web objects", "Web page allocation", "Web page count", "Work Team level"}
Britto_2016 = {"Centralized", "distributed", "Early", "Estimator", "Early & Late", "Estimator & Provider", "geographic distance", "geographic distance", "late", "legal entity", "location", "provider", "semi-distributed", "temporal distance", "temporal distance"}
Dasthi = {"ANN", "Analogy Base", "COCOMO", "Evolutionary", "Expert Judgment", "FUZZY", "SEER-SEM", "SLIM", "Swarm"}
Mendes = {"Absolute", "both", "complexity", "functionality", "Directly", "Early size metric", "Empirically", "indirectly", "interval", "Length", "late size metric", "media", "none", "Nominal", "nonspecific", "ordinal", "other", "Problem oriented metric", "program/script", "ratio", "solution oriented metric", "Specific", "theoretically", "Web application", "Web hypermedia application", "Web software application"}
Usman = {"Analysis", "all", "analogy", "availability", "bidding", "Close Onshore", "Co-located", "Communications industry", "Considered", "crystal", "customized XP", "customized scrum", "daily", "design", "distribution", "education", "expert judgement", "DSDM", "Distant Onshore", "Estimate value(s)", "FDD", "Far Offshore", "financial", "function points", "Hours/days", "health", "ideal hours", "implementation", "kanban", "maintainability", "maintenance", "manufacturing", "MMRE", "MdMRE", "Near Offshore", "No. of team members", "not applicable", "not considered", "not used", "Other", "Performance", "Planning poker", "Point", "pair days", "Release", "reliability", "retail/wholesale", "Single", "scrum", "security", "sprint", "Story points", "testing", "three point", "task", "transportation", "UC points", "User story", "Value", "XP"}
# Create a dictionary to store the sets
sets = {
"Bajta": Bajta,
"Britto_2016": Britto_2016,
"Britto_2017": Britto_2017,
"Dasthi": Dasthi,
"Mendes": Mendes,
"Usman": Usman
}
# Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
if 'model' not in locals() or 'tokenizer' not in locals():
print("Loading model and tokenizer...")
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Function to normalize text to lowercase
def normalize_words(words):
return {word.lower() for word in words}
# Normalize all words in the sets to lowercase
normalized_sets = {set_name: normalize_words(word_set) for set_name, word_set in sets.items()}
# Function to get embeddings for a list of words
def get_embeddings(words):
inputs = tokenizer(list(words), padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
embeddings = model(**inputs).last_hidden_state.mean(dim=1) # Mean pooling
return embeddings
# Create a dictionary to store the embeddings of each set
embeddings = {}
for set_name, word_set in normalized_sets.items():
embeddings[set_name] = get_embeddings(word_set)
# Create a function to calculate the semantic similarity between sets
def compute_similarity(set1, set2):
# Get the embeddings for both sets
embeddings1 = embeddings[set1]
embeddings2 = embeddings[set2]
# Calculate cosine similarity between all pairs of words in set1 and set2
sim_matrix = cosine_similarity(embeddings1, embeddings2)
return sim_matrix
# Create a similarity matrix for each pair of sets
similarity_results = {}
for set1 in normalized_sets.keys():
for set2 in normalized_sets.keys():
if set1 != set2:
sim_matrix = compute_similarity(set1, set2)
similarity_results[(set1, set2)] = sim_matrix
# Create a simple table to store the similarity values
similarity_table = []
# Populate the table with word pairs and their cosine similarity values
for (set1, set2), sim_matrix in similarity_results.items():
for i, word1 in enumerate(normalized_sets[set1]):
for j, word2 in enumerate(normalized_sets[set2]):
similarity_table.append({
"Set 1": set1,
"Word 1": word1,
"Set 2": set2,
"Word 2": word2,
"Cosine Similarity": sim_matrix[i, j]
})
# Convert the table to a DataFrame for better display
similarity_df = pd.DataFrame(similarity_table)
# Filter the DataFrame to keep only cosine similarities above 0.7
similarity_df_filtered = similarity_df[similarity_df['Cosine Similarity'] > 0.7]
# Create an empty table to store the words that are similar
common_words_table = pd.DataFrame(index=sets.keys(), columns=sets.keys(), dtype=object)
# Populate the table with word pairs that have similarity above 0.7
for index, row in similarity_df_filtered.iterrows():
set1 = row['Set 1']
word1 = row['Word 1']
set2 = row['Set 2']
word2 = row['Word 2']
# Check if the cell is empty or needs to be updated with word pairs
if pd.isna(common_words_table.at[set1, set2]):
common_words_table.at[set1, set2] = f"{word1} - {word2}"
else:
common_words_table.at[set1, set2] += f", {word1} - {word2}"
# Display the table showing the common word pairs
print(common_words_table)
## Bajta ... Usman
## Bajta NaN ... cocomo - co-located, maintenance - maintenance...
## Britto_2016 temporal distance - temporal distance, geograp... ... semi-distributed - distribution, location - co...
## Britto_2017 quality level - performance, process efficienc... ... quality level - performance, process efficienc...
## Dasthi cocomo - cocomo, fuzzy - fuzzy similarity, exp... ... cocomo - co-located, analogy base - analogy, e...
## Mendes none - not considered, length - size report, l... ... none - all, none - not applicable, none - othe...
## Usman hours/days - effort hours, co-located - cocomo... ... NaN
##
## [6 rows x 6 columns]
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
plt.clf()
plt.style.use('seaborn-v0_8-whitegrid') # You can change this to any available style
plt.rcParams['font.family'] = 'serif'
df = similarity_df_filtered
# Group by Set 1 and Set 2 and count the number of shared words
count_table = df.groupby(["Set 1", "Set 2"]).size().reset_index(name="Shared Word Count")
# Pivot the DataFrame to create a matrix of shared word counts
pivot_count = count_table.pivot(index="Set 1", columns="Set 2", values="Shared Word Count").fillna(0)
# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_count, annot=True, cmap="Blues", cbar_kws={'label': 'Number of Shared Words'})
plt.title("Heatmap of Shared Words between Literature")
plt.xlabel("Literature")
plt.ylabel("Literature")
plt.xticks(rotation=45, ha="right")
## (array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), [Text(0.5, 0, 'Bajta'), Text(1.5, 0, 'Britto_2016'), Text(2.5, 0, 'Britto_2017'), Text(3.5, 0, 'Dasthi'), Text(4.5, 0, 'Mendes'), Text(5.5, 0, 'Usman')])
plt.yticks(rotation=0)
## (array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), [Text(0, 0.5, 'Bajta'), Text(0, 1.5, 'Britto_2016'), Text(0, 2.5, 'Britto_2017'), Text(0, 3.5, 'Dasthi'), Text(0, 4.5, 'Mendes'), Text(0, 5.5, 'Usman')])
plt.tight_layout()
plt.savefig('word_counts.png', dpi=300, bbox_inches='tight')
plt.show()